Determine ranked spearman correlation between single cell RNASeq (scRNASeq) and bulk RNASeq (bRNASeq)
Creating new pipeline using seurat v4.0.2 available 2021.06.08
Load libraries required for Seuratv4
library(dplyr)
library(Seurat)
library(patchwork)
library(ggplot2)
library(SingleCellExperiment)
library(Matrix.utils)
library(DESeq2)
library(edgeR)
library(csaw)
library(tximport)
projectName <- "pseudobulk_CMP"
sessionInfo.filename <- paste0(projectName, "_sessionInfo.txt")
sink(sessionInfo.filename)
sessionInfo()
sink()
Using CMP.rds object of dim=25, describing 85% of variation
# CMP seurat object generated in "CMPSubset.Rmd"
seurat.object <- readRDS("CMP_dim25.RDS")
p <- FeaturePlot(seurat.object, features = c("Fli1", "Klf1"), blend = TRUE, pt.size = 1, combine = TRUE, keep.scale = "all")
plot(p)
png(filename = "CMP_Fli1Klf1Ratio.png", width = 2400, height = 400)
plot(p)
dev.off()
quartz_off_screen
3
p <- FeaturePlot(seurat.object, features = c("Gata2", "Gata1"), blend = TRUE, pt.size = 1, combine = TRUE, keep.scale = "all")
plot(p)
png(filename = "CMP_Gata2Gata1Ratio.png", width = 2400, height = 400)
plot(p)
dev.off()
quartz_off_screen
3
__ Based on tutorial @ https://hbctraining.github.io/scRNA-seq/lessons/pseudobulk_DESeq2_scrnaseq.html __
# Extract raw counts and metadata to create SingleCellExperiment object
counts <- seurat.object@assays$RNA@counts
metadata <- seurat.object@meta.data
# Set up metadata as desired for aggregation and DE analysis
metadata$cluster_id <- factor(seurat.object@active.ident)
# Create single cell experiment object
sce <- SingleCellExperiment(assays = list(counts = counts),
colData = metadata)
Investigate the object
print(dim(counts(sce)))
[1] 16278 12059
print(dim(colData(sce)))
[1] 12059 12
print(head(colData(sce)))
DataFrame with 6 rows and 12 columns
orig.ident nCount_RNA nFeature_RNA percent.mt clust.ID RNA_snn_res.0.5 seurat_clusters RNA_snn_res.1 RNA_snn_res.1.5 RNA_snn_res.2 RNA_snn_res.2.5
<factor> <numeric> <integer> <numeric> <numeric> <factor> <factor> <factor> <factor> <factor> <factor>
CMPm2_AAACATACAAGCCT-1 CMPm2 2731 1258 2.30685 11 5 9 6 7 10 9
CMPm2_AAACATACACCCTC-1 CMPm2 16980 3743 2.30271 0 5 25 6 19 24 25
CMPm2_AAACATACACTGTG-1 CMPm2 5920 2083 2.71959 0 3 10 11 14 18 10
CMPm2_AAACATACAGAAGT-1 CMPm2 6314 2115 2.54989 0 2 4 0 5 4 4
CMPm2_AAACATACAGAGGC-1 CMPm2 4749 1861 2.65319 17 3 14 13 11 21 14
CMPm2_AAACATACCCAACA-1 CMPm2 13138 3549 2.55747 11 1 17 14 15 19 17
cluster_id
<factor>
CMPm2_AAACATACAAGCCT-1 9
CMPm2_AAACATACACCCTC-1 25
CMPm2_AAACATACACTGTG-1 10
CMPm2_AAACATACAGAAGT-1 4
CMPm2_AAACATACAGAGGC-1 14
CMPm2_AAACATACCCAACA-1 17
Note that QC was performed during Seurat analysis
sce <- calculateQCMetrics(sce)
Error: 'calculateQCMetrics' is defunct.
Use 'perCellQCMetrics' instead.
See help("Defunct")
groups <- colData(sce)[, "orig.ident"]
aggr.counts <- aggregate.Matrix(t(counts(sce)), groupings = groups, fun = "sum")
dim(aggr.counts)
[1] 1 16278
sc.dds <- DESeqDataSetFromMatrix(countData = aggr.counts, design = ~orig.ident)
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'nrow': argument "colData" is missing, with no default
# import rsem.genes.results files
setwd("../../bRNASeq/")
Warning: The working directory was changed to /Users/heustonef/Desktop/10XGenomicsData/bRNASeq inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
file_list <- list.files(pattern = "^CMP", recursive = TRUE, include.dirs = TRUE)
names(file_list) <- lapply(file_list, function(x) gsub('.genes.results', '', x))
file_list
ScriptSeq/CMP1009 ScriptSeq/CMP1010 TruSeq/CMP443 TruSeq/CMP448
"ScriptSeq/CMP1009.genes.results" "ScriptSeq/CMP1010.genes.results" "TruSeq/CMP443.genes.results" "TruSeq/CMP448.genes.results"
mstxi <- tximport(files = paste0("../../bRNASeq/", file_list), type = 'rsem', txIn = FALSE, txOut = FALSE, geneIdCol = 3, )
reading in files with read_tsv
1 2 3 4
colnames(mstxi$counts) <- gsub("^.*/", "", names(file_list))
rna.counts <- merge(mstxi$counts, t(aggr.counts), by = "row.names", all = TRUE)
rna.counts[is.na(rna.counts)]<-0
rownames(rna.counts) <- rna.counts$Row.names
rna.counts <- rna.counts[, 2:ncol(rna.counts)]
head(rna.counts)
# Set reference sample =-----------------------------------------------------------
dgelist_groups <- factor(c(unlist(stringr::str_extract_all(gsub("/.*", "", names(file_list)), pattern = "[a-zA-Z]+")), "SingleCell"))
# dgelist_groups <- relevel(dgelist_groups, ref = "CMP")
levels(dgelist_groups)
[1] "ScriptSeq" "SingleCell" "TruSeq"
dgelist <- DGEList(counts = rna.counts, group = dgelist_groups)
dgelist
An object of class "DGEList"
$counts
CMP1009 CMP1010 CMP443 CMP448 CMPm2
0610005C13Rik 0 0 0 0 31
0610007P14Rik 138 92 244 459 0
0610009B22Rik 0 16 118 75 5544
0610009E02Rik 0 0 0 0 144
0610009L18Rik 0 7 0 32 1106
27030 more rows ...
$samples
apply(dgelist$counts, 2, sum)
CMP1009 CMP1010 CMP443 CMP448 CMPm2
1705279 5065044 31763431 28668883 92325072
keep <- rowSums(cpm(dgelist) > 10) >=2
dgelist <- dgelist[keep,]
dgelist$samples$lib.size <- colSums(dgelist$counts)
dgelist <- calcNormFactors(dgelist)
designMat
designMat <- model.matrix(~ 0 + dgelist$samples$group)
colnames(designMat) <- levels(dgelist$samples$group)
designMat
ScriptSeq SingleCell TruSeq
1 1 0 0
2 1 0 0
3 0 0 1
4 0 0 1
5 0 1 0
attr(,"assign")
[1] 1 1 1
attr(,"contrasts")
attr(,"contrasts")$`dgelist$samples$group`
[1] "contr.treatment"
estimate GLM dispersion and apply
dgelist <- estimateGLMCommonDisp(dgelist, designMat)
dgelist <- estimateGLMTrendedDisp(dgelist, designMat, method = 'bin.spline')
dgelist <- estimateGLMTagwiseDisp(dgelist, designMat)
fit <- glmQLFit(dgelist, designMat, robust = TRUE)
plotQLDisp(fit)
qlf <- glmQLFTest(fit, coef = 1:2)
summary(decideTests(qlf))
SingleCell-ScriptSeq
NotSig 1
Sig 10024
rld <- rlog(round(dgelist$counts), blind = TRUE)
converting counts to integer mode
rld <- as.data.frame(rld)
rld.spear <- cor(rld, method = "spearman")
rld.melt <- reshape2::melt(rld)
No id variables; using all as measure variables
colnames(rld.melt) <- c("sample", "rld")
rld.spear
CMP1009 CMP1010 CMP443 CMP448 CMPm2
CMP1009 1.0000000 0.9007321 0.7848357 0.7830653 0.6121268
CMP1010 0.9007321 1.0000000 0.8423723 0.8411575 0.6431091
CMP443 0.7848357 0.8423723 1.0000000 0.9684792 0.7156723
CMP448 0.7830653 0.8411575 0.9684792 1.0000000 0.7286673
CMPm2 0.6121268 0.6431091 0.7156723 0.7286673 1.0000000
ggplot(data = rld.melt, aes(x = sample, y = rld, fill = sample)) + geom_violin()
ggplot(data = rld, aes(x = CMPm2, y = CMP1010)) + geom_point()
pool.counts <- dgelist$counts
pool.counts <- as.data.frame(pool.counts)
pool.counts$ScriptSeq <- (pool.counts$CMP1009 + pool.counts$CMP1010)/2
pool.counts$TruSeq <- (pool.counts$CMP443 + pool.counts$CMP448)/2
pool.counts <- subset(pool.counts, select = c(ScriptSeq, TruSeq, CMPm2))
pool.counts <- as.matrix(pool.counts)
pool.rld <- rlog(round((pool.counts)), blind = TRUE)
converting counts to integer mode
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
pool.rld.spear <- cor(pool.rld, method = "spearman")
pool.rld.melt <- reshape2::melt(pool.rld)
colnames(pool.rld.melt) <- c("sample", "rld")
pool.rld.spear
ScriptSeq TruSeq CMPm2
ScriptSeq 1.0000000 0.7877211 0.5488382
TruSeq 0.7877211 1.0000000 0.6419253
CMPm2 0.5488382 0.6419253 1.0000000
ggplot(data = pool.rld.melt, aes(x = sample, y = rld, fill = sample)) + geom_violin()
ggplot(data = pool.rld.melt, aes(x = CMPm2, y = ScriptSeq)) + geom_point()
Error in FUN(X[[i]], ...) : object 'CMPm2' not found
Combine replicates from CPM table
rna.cpm <- as.data.frame(cpm(dgelist, log = TRUE, normalized.lib.sizes = TRUE))
rna.cpm$ScriptSeq <- (rna.cpm$CMP1009 + rna.cpm$CMP1010)/2
rna.cpm$TruSeq <- (rna.cpm$CMP443 + rna.cpm$CMP448)/2
rna.cpm <- subset(rna.cpm, select = c(ScriptSeq, TruSeq, CMPm2))
plot correlation and scatter grams
cpm.spear <- cor(rna.cpm, method = "spearman")
cpm.melt <- reshape2::melt(rld)
No id variables; using all as measure variables
colnames(cpm.melt) <- c("sample", "rld")
cpm.spear
ScriptSeq TruSeq CMPm2
ScriptSeq 1.0000000 0.6132299 0.2978549
TruSeq 0.6132299 1.0000000 0.4450933
CMPm2 0.2978549 0.4450933 1.0000000
sc.dds <- DESeqDataSetFromMatrix(countData = aggr.counts, colData = metadata, design = ~orig.ident)